# Pandas / Numpy
import pandas as pd
import numpy as np
# NLTK - 3.8.1
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Regular Expressions
import re
import string
# Date/Time
import datetime
# sklearn - 1.2.2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.model_selection import cross_val_score, GridSearchCV
# Other Classification Models - LightGBM - 4.0.0 / XGBoost - 1.7.6 / CatBoost - 1.2
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
# News API
from newsapi import NewsApiClient
# Markdown
from IPython.display import Markdown, display
# Multi-Processing / Threading
from joblib import parallel_backend
# Download Stopwords
# Download the NLTK resources used below (stop words, tokenizer models, lemmatizer data)
for nltkPackage in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltkPackage)
# English stop-word list shared by the word-frequency analysis further down
stopWords = stopwords.words("english")
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\SimonMurrell\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\SimonMurrell\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\SimonMurrell\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
# Assign Colours (brand palette hex codes used across all plots)
charcoalColour = "#63666A"
tealColour = "#00ACC9"
magentaColour = "#E6007E"  # was assigned twice in a row; duplicate removed
greenColour = "#80BA27"
lightGreenColour = "#B8DA25"
blueColour = "#50A2D4"
emeraldColour = "#05AF9A"
purpleColour = "#7E4CA5"
orangeColour = "#FF8300"
redColour = "#FA4616"
yellowColour = "#F3C317"
# News API Key - 1000 requests per 30 minutes
# SECURITY: this key is hard-coded and therefore exposed wherever this file is
# shared. Rotate the key and load it from an environment variable or a secrets
# store instead of committing it to source.
newsAPIKey = "d8061c628ce241bbb0a34d4b61fcd707"
# Human-readable display names for each classifier evaluated below
classificationName_nb = "Naive Bayes"
classificationName_lr = "Logistic Regression"
classificationName_svm = "Linear SVC"
classificationName_rf = "Random Forest"
classificationName_xg = "XG Boost"
classificationName_lg = "LightGBM"
classificationName_ab = "Adaptive Boosting"
classificationName_cb = "CatBoost"
# Load Real Datasets
real_dataset = pd.read_csv("./dataset/true.csv")
# Check Shape
real_dataset.shape
(21417, 4)
# Load Fake Dataset
fake_dataset = pd.read_csv("./dataset/fake.csv")
# Check Shape
fake_dataset.shape
(23481, 4)
# Check True Dataset
real_dataset.head()
| title | text | subject | date | |
|---|---|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | WASHINGTON (Reuters) - The head of a conservat... | politicsNews | December 31, 2017 |
| 1 | U.S. military to accept transgender recruits o... | WASHINGTON (Reuters) - Transgender people will... | politicsNews | December 29, 2017 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | WASHINGTON (Reuters) - The special counsel inv... | politicsNews | December 31, 2017 |
| 3 | FBI Russia probe helped by Australian diplomat... | WASHINGTON (Reuters) - Trump campaign adviser ... | politicsNews | December 30, 2017 |
| 4 | Trump wants Postal Service to charge 'much mor... | SEATTLE/WASHINGTON (Reuters) - President Donal... | politicsNews | December 29, 2017 |
# Check Fake Dataset
fake_dataset.head()
| title | text | subject | date | |
|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | News | December 31, 2017 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | News | December 31, 2017 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | News | December 30, 2017 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | News | December 29, 2017 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | News | December 25, 2017 |
# Label the rows: 1 = real news, 0 = fake news
real_dataset["label"] = 1
fake_dataset["label"] = 0
# subject/date are not used as model features, so drop them from both frames
real_dataset = real_dataset.drop("subject", axis=1).drop("date", axis=1)
fake_dataset = fake_dataset.drop("subject", axis=1).drop("date", axis=1)
# Check Shape
real_dataset.shape
(21417, 3)
# Check True Dataset
real_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | WASHINGTON (Reuters) - The head of a conservat... | 1 |
| 1 | U.S. military to accept transgender recruits o... | WASHINGTON (Reuters) - Transgender people will... | 1 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | WASHINGTON (Reuters) - The special counsel inv... | 1 |
| 3 | FBI Russia probe helped by Australian diplomat... | WASHINGTON (Reuters) - Trump campaign adviser ... | 1 |
| 4 | Trump wants Postal Service to charge 'much mor... | SEATTLE/WASHINGTON (Reuters) - President Donal... | 1 |
# Check Shape
fake_dataset.shape
(23481, 3)
# Check Fake Dataset
fake_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | 0 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | 0 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | 0 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | 0 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | 0 |
def extractTextAfterHyphen(text):
    """Strip a leading source prefix such as "WASHINGTON (Reuters) -" from *text*.

    Splits on the FIRST hyphen only, so hyphens later in the article body are
    preserved. The original ``text.split("-")`` + ``parts[1]`` kept only the
    segment between the first and second hyphen, silently discarding the rest
    of any article containing more than one hyphen.

    Returns *text* unchanged when it contains no hyphen.
    """
    parts = text.split("-", 1)
    if len(parts) > 1:
        return parts[1].strip()
    return text
# Strip out Prefix
real_dataset["text"] = real_dataset["text"].apply(extractTextAfterHyphen)
# Check Real Dataset
real_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | The head of a conservative Republican faction ... | 1 |
| 1 | U.S. military to accept transgender recruits o... | Transgender people will be allowed for the fir... | 1 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | The special counsel investigation of links bet... | 1 |
| 3 | FBI Russia probe helped by Australian diplomat... | Trump campaign adviser George Papadopoulos tol... | 1 |
| 4 | Trump wants Postal Service to charge 'much mor... | President Donald Trump called on the U.S. Post... | 1 |
# Combine Datasets
news_dataset = pd.concat([real_dataset, fake_dataset])
# Check Shape
news_dataset.shape
(44898, 3)
# Check Combined Dataset
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | The head of a conservative Republican faction ... | 1 |
| 1 | U.S. military to accept transgender recruits o... | Transgender people will be allowed for the fir... | 1 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | The special counsel investigation of links bet... | 1 |
| 3 | FBI Russia probe helped by Australian diplomat... | Trump campaign adviser George Papadopoulos tol... | 1 |
| 4 | Trump wants Postal Service to charge 'much mor... | President Donald Trump called on the U.S. Post... | 1 |
# Check Info
news_dataset.info()
<class 'pandas.core.frame.DataFrame'> Index: 44898 entries, 0 to 23480 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 44898 non-null object 1 text 44898 non-null object 2 label 44898 non-null int64 dtypes: int64(1), object(2) memory usage: 1.4+ MB
# Describe
news_dataset.describe()
| label | |
|---|---|
| count | 44898.000000 |
| mean | 0.477015 |
| std | 0.499477 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
# Get Label Value Counts
labelCounts = news_dataset["label"].value_counts()
# Get Label Mappings
labelMappings = {0: "Fake News", 1: "Real News"}
# Setup Colours
colours = [charcoalColour, tealColour]
# Create Pie Chart
plt.pie(labelCounts, labels=labelCounts.index.map(labelMappings), autopct="%1.1f%%", colors=colours)
# Set Legend
plt.legend(title="News Type", labels=["Fake News", "Real News"], loc="center left", bbox_to_anchor=(1, 0.5))
# Set Title
plt.title("Real vs Fake News")
# Show
plt.show()
# Check for missing values
missing_values = news_dataset.isnull().sum()
# Print
missing_values
title 0 text 0 label 0 dtype: int64
# Check if there is any unreliable data
# Print the per-column missing-value counts computed above.
# (The header was an f-string with no placeholders; plain string is correct.)
print("Missing Values")
print(f"Title: {missing_values.title}")
print(f"Text: {missing_values.text}")
print(f"Label: {missing_values.label}")
# Drop rows with any missing value (a no-op here, but keeps the pipeline safe)
news_dataset = news_dataset.dropna()
# Summary statistics after the drop
news_dataset.describe()
Missing Values Title: 0 Text: 0 Label: 0
| label | |
|---|---|
| count | 44898.000000 |
| mean | 0.477015 |
| std | 0.499477 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | The head of a conservative Republican faction ... | 1 |
| 1 | U.S. military to accept transgender recruits o... | Transgender people will be allowed for the fir... | 1 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | The special counsel investigation of links bet... | 1 |
| 3 | FBI Russia probe helped by Australian diplomat... | Trump campaign adviser George Papadopoulos tol... | 1 |
| 4 | Trump wants Postal Service to charge 'much mor... | President Donald Trump called on the U.S. Post... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | 0 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | 0 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | 0 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | 0 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | 0 |
# Lowercase words
news_dataset["title"] = news_dataset["title"].str.lower()
news_dataset["text"] = news_dataset["text"].str.lower()
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | as u.s. budget fight looms, republicans flip t... | the head of a conservative republican faction ... | 1 |
| 1 | u.s. military to accept transgender recruits o... | transgender people will be allowed for the fir... | 1 |
| 2 | senior u.s. republican senator: 'let mr. muell... | the special counsel investigation of links bet... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | trump campaign adviser george papadopoulos tol... | 1 |
| 4 | trump wants postal service to charge 'much mor... | president donald trump called on the u.s. post... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | donald trump sends out embarrassing new year’... | donald trump just couldn t wish all americans ... | 0 |
| 1 | drunk bragging trump staffer started russian ... | house intelligence committee chairman devin nu... | 0 |
| 2 | sheriff david clarke becomes an internet joke... | on friday, it was revealed that former milwauk... | 0 |
| 3 | trump is so obsessed he even has obama’s name... | on christmas day, donald trump announced that ... | 0 |
| 4 | pope francis just called out donald trump dur... | pope francis used his annual christmas day mes... | 0 |
def removeUrls(text):
    """Strip HTTP/HTTPS URLs from *text* and return the cleaned string."""
    # Single regex pass: "http://" or "https://" followed by any run of URL characters
    return re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        "",
        text,
    )
# Remove URLs
news_dataset["title"] = news_dataset["title"].apply(removeUrls)
news_dataset["text"] = news_dataset["text"].apply(removeUrls)
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | as u.s. budget fight looms, republicans flip t... | the head of a conservative republican faction ... | 1 |
| 1 | u.s. military to accept transgender recruits o... | transgender people will be allowed for the fir... | 1 |
| 2 | senior u.s. republican senator: 'let mr. muell... | the special counsel investigation of links bet... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | trump campaign adviser george papadopoulos tol... | 1 |
| 4 | trump wants postal service to charge 'much mor... | president donald trump called on the u.s. post... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | donald trump sends out embarrassing new year’... | donald trump just couldn t wish all americans ... | 0 |
| 1 | drunk bragging trump staffer started russian ... | house intelligence committee chairman devin nu... | 0 |
| 2 | sheriff david clarke becomes an internet joke... | on friday, it was revealed that former milwauk... | 0 |
| 3 | trump is so obsessed he even has obama’s name... | on christmas day, donald trump announced that ... | 0 |
| 4 | pope francis just called out donald trump dur... | pope francis used his annual christmas day mes... | 0 |
# Remove Punctuation
removepunctuation = str.maketrans("", "", string.punctuation)
news_dataset["title"] = news_dataset["title"].str.translate(removepunctuation)
news_dataset["text"] = news_dataset["text"].str.translate(removepunctuation)
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | as us budget fight looms republicans flip thei... | the head of a conservative republican faction ... | 1 |
| 1 | us military to accept transgender recruits on ... | transgender people will be allowed for the fir... | 1 |
| 2 | senior us republican senator let mr mueller do... | the special counsel investigation of links bet... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | trump campaign adviser george papadopoulos tol... | 1 |
| 4 | trump wants postal service to charge much more... | president donald trump called on the us postal... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | donald trump sends out embarrassing new year’... | donald trump just couldn t wish all americans ... | 0 |
| 1 | drunk bragging trump staffer started russian ... | house intelligence committee chairman devin nu... | 0 |
| 2 | sheriff david clarke becomes an internet joke... | on friday it was revealed that former milwauke... | 0 |
| 3 | trump is so obsessed he even has obama’s name... | on christmas day donald trump announced that h... | 0 |
| 4 | pope francis just called out donald trump dur... | pope francis used his annual christmas day mes... | 0 |
# Remove Numbers
news_dataset["title"] = news_dataset["title"].str.replace(r"\d+", "", regex=True)
news_dataset["text"] = news_dataset["text"].str.replace(r"\d+", "", regex=True)
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | as us budget fight looms republicans flip thei... | the head of a conservative republican faction ... | 1 |
| 1 | us military to accept transgender recruits on ... | transgender people will be allowed for the fir... | 1 |
| 2 | senior us republican senator let mr mueller do... | the special counsel investigation of links bet... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | trump campaign adviser george papadopoulos tol... | 1 |
| 4 | trump wants postal service to charge much more... | president donald trump called on the us postal... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | donald trump sends out embarrassing new year’... | donald trump just couldn t wish all americans ... | 0 |
| 1 | drunk bragging trump staffer started russian ... | house intelligence committee chairman devin nu... | 0 |
| 2 | sheriff david clarke becomes an internet joke... | on friday it was revealed that former milwauke... | 0 |
| 3 | trump is so obsessed he even has obama’s name... | on christmas day donald trump announced that h... | 0 |
| 4 | pope francis just called out donald trump dur... | pope francis used his annual christmas day mes... | 0 |
def removeUnicode(text):
    """Return *text* with every non-ASCII character removed."""
    # Round-trip through ASCII bytes, silently dropping characters that cannot encode
    asciiOnly = text.encode("ascii", "ignore")
    return asciiOnly.decode("ascii")
# Remove Unicode
news_dataset["title"] = news_dataset["title"].apply(removeUnicode)
news_dataset["text"] = news_dataset["text"].apply(removeUnicode)
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | as us budget fight looms republicans flip thei... | the head of a conservative republican faction ... | 1 |
| 1 | us military to accept transgender recruits on ... | transgender people will be allowed for the fir... | 1 |
| 2 | senior us republican senator let mr mueller do... | the special counsel investigation of links bet... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | trump campaign adviser george papadopoulos tol... | 1 |
| 4 | trump wants postal service to charge much more... | president donald trump called on the us postal... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | donald trump sends out embarrassing new years... | donald trump just couldn t wish all americans ... | 0 |
| 1 | drunk bragging trump staffer started russian ... | house intelligence committee chairman devin nu... | 0 |
| 2 | sheriff david clarke becomes an internet joke... | on friday it was revealed that former milwauke... | 0 |
| 3 | trump is so obsessed he even has obamas name ... | on christmas day donald trump announced that h... | 0 |
| 4 | pope francis just called out donald trump dur... | pope francis used his annual christmas day mes... | 0 |
# Shared WordNet lemmatizer instance used by lemmatizeText below
lemmatizer = WordNetLemmatizer()
def lemmatizeText(text):
    """Lemmatize every token in *text* and return the tokens re-joined with spaces."""
    # Tokenize, reduce each token to its WordNet lemma, then stitch back together
    return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))
# Lemmatize Words
news_dataset["title"] = news_dataset["title"].apply(lemmatizeText)
news_dataset["text"] = news_dataset["text"].apply(lemmatizeText)
# Print Real News Head
news_dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | a u budget fight loom republican flip their fi... | the head of a conservative republican faction ... | 1 |
| 1 | u military to accept transgender recruit on mo... | transgender people will be allowed for the fir... | 1 |
| 2 | senior u republican senator let mr mueller do ... | the special counsel investigation of link betw... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | trump campaign adviser george papadopoulos tol... | 1 |
| 4 | trump want postal service to charge much more ... | president donald trump called on the u postal ... | 1 |
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
| title | text | label | |
|---|---|---|---|
| 0 | donald trump sends out embarrassing new year e... | donald trump just couldn t wish all american a... | 0 |
| 1 | drunk bragging trump staffer started russian c... | house intelligence committee chairman devin nu... | 0 |
| 2 | sheriff david clarke becomes an internet joke ... | on friday it wa revealed that former milwaukee... | 0 |
| 3 | trump is so obsessed he even ha obamas name co... | on christmas day donald trump announced that h... | 0 |
| 4 | pope francis just called out donald trump duri... | pope francis used his annual christmas day mes... | 0 |
# Current techniques used in full project:
# Remove unreliable data
# Lowercase words
# Remove URLs
# Remove punctuation
# Remove Numbers
# Remove Unicode Characters
# Lemmatization / Stemming (Not Applied)
# Stop word removal - Not used as it reduced performance
def _buildWordCountFrame(column, label, typeName, includeWordLength=False):
    """Build a word-frequency DataFrame for one slice of news_dataset.

    Fits a fresh CountVectorizer on the rows where "label" == label, using the
    given text column ("title" or "text"). Returns a DataFrame with columns:
    word, count (sorted by count descending), type, label — plus word_length
    when includeWordLength is True.

    Replaces four near-identical copy/paste blocks in the original code.
    """
    # Bag-of-words over the selected slice
    wordCounter = CountVectorizer()
    bow = wordCounter.fit_transform(news_dataset.loc[news_dataset["label"] == label, column])
    # Total occurrences of each vocabulary word across the slice
    wordCountDict = dict(zip(wordCounter.get_feature_names_out(), bow.sum(axis=0).tolist()[0]))
    sortedWordCounts = sorted(wordCountDict.items(), key=lambda item: item[1], reverse=True)
    frame = pd.DataFrame(sortedWordCounts)
    frame = frame.rename(columns={0: "word", 1: "count"})
    if includeWordLength:
        # Only the real-text frame carried this column in the original code;
        # kept as-is to preserve downstream column layout
        frame["word_length"] = frame["word"].apply(len)
    frame["type"] = typeName
    frame["label"] = label
    return frame
# One frame per (column, label) combination
dfTitleRealWords = _buildWordCountFrame("title", 1, "title")
dfTitleFakeWords = _buildWordCountFrame("title", 0, "title")
dfTextRealWords = _buildWordCountFrame("text", 1, "text", includeWordLength=True)
dfTextFakeWords = _buildWordCountFrame("text", 0, "text")
# Merge the real/fake title word counts into one frame keyed on "word"
dfTitleWords = pd.merge(dfTitleRealWords, dfTitleFakeWords, on="word", how="outer")
# Drop the metadata columns duplicated by the merge
for duplicated in ("type_x", "label_x", "type_y", "label_y"):
    dfTitleWords = dfTitleWords.drop(duplicated, axis=1)
# Rename the merged count columns to their class
dfTitleWords = dfTitleWords.rename(columns={"count_x": "real_count", "count_y": "fake_count"})
# Remove stop words (each "word" entry is a single token, so this blanks stop words out)
dfTitleWords["word"] = dfTitleWords["word"].apply(
    lambda token: " ".join(w for w in token.split() if w.lower() not in stopWords)
)
dfTitleWords = dfTitleWords[dfTitleWords["word"] != ""]
# Words present on only one side of the outer merge have NaN counts; drop them
dfTitleWords = dfTitleWords.dropna()
# NaNs forced the counts to float; cast back to int now that they are gone
dfTitleWords["real_count"] = dfTitleWords["real_count"].astype(int)
dfTitleWords["fake_count"] = dfTitleWords["fake_count"].astype(int)
# Combined frequency across both classes, most frequent first
dfTitleWords["total_count"] = dfTitleWords["real_count"] + dfTitleWords["fake_count"]
dfTitleWords = dfTitleWords.sort_values(by="total_count", ascending=False)
# Merge the real/fake article-text word counts into one frame keyed on "word"
dfTextWords = pd.merge(dfTextRealWords, dfTextFakeWords, on="word", how="outer")
# Drop the metadata columns duplicated by the merge
for duplicated in ("type_x", "label_x", "type_y", "label_y"):
    dfTextWords = dfTextWords.drop(duplicated, axis=1)
# Rename the merged count columns to their class
dfTextWords = dfTextWords.rename(columns={"count_x": "real_count", "count_y": "fake_count"})
# Remove stop words (each "word" entry is a single token, so this blanks stop words out)
dfTextWords["word"] = dfTextWords["word"].apply(
    lambda token: " ".join(w for w in token.split() if w.lower() not in stopWords)
)
dfTextWords = dfTextWords[dfTextWords["word"] != ""]
# Words present on only one side of the outer merge have NaN counts; drop them
dfTextWords = dfTextWords.dropna()
# NaNs forced the counts to float; cast back to int now that they are gone
dfTextWords["real_count"] = dfTextWords["real_count"].astype(int)
dfTextWords["fake_count"] = dfTextWords["fake_count"].astype(int)
# Combined frequency across both classes, most frequent first
dfTextWords["total_count"] = dfTextWords["real_count"] + dfTextWords["fake_count"]
dfTextWords = dfTextWords.sort_values(by="total_count", ascending=False)
# Real article titles with their character lengths
dfRealTitles = news_dataset[news_dataset["label"] == 1].drop("text", axis=1).drop("label", axis=1)
dfRealTitles["title_length"] = dfRealTitles["title"].apply(len)
# Remove Rubbish: discard empty titles
dfRealTitles = dfRealTitles[dfRealTitles["title"] != ""]
dfRealTitles = dfRealTitles.sort_values(by="title_length", ascending=True)
# Fake article titles with their character lengths
dfFakeTitles = news_dataset[news_dataset["label"] == 0].drop("text", axis=1).drop("label", axis=1)
dfFakeTitles["title_length"] = dfFakeTitles["title"].apply(len)
# Remove Rubbish: discard empty titles
dfFakeTitles = dfFakeTitles[dfFakeTitles["title"] != ""]
dfFakeTitles = dfFakeTitles.sort_values(by="title_length", ascending=True)
# Get Real Title Length Average / Size
dfRealTitleLengths = dfRealTitles.groupby("title_length").size().reset_index(name="count")
# Get Fake Title Length Average / Size
dfFakeTitleLengths = dfFakeTitles.groupby("title_length").size().reset_index(name="count")
# Set Lines
plt.plot(dfRealTitleLengths["title_length"], dfRealTitleLengths["count"], label="Real", c=charcoalColour)
plt.plot(dfFakeTitleLengths["title_length"], dfFakeTitleLengths["count"], label="Fake", c=tealColour)
# Set Title
plt.title("Article Title Lengths - Real vs Fake")
plt.xlabel("Title Lengths")
plt.ylabel("Count")
# Set Legend
plt.legend()
# Show
plt.show()
# Real article bodies with their character lengths
dfRealText = news_dataset[news_dataset["label"] == 1].drop("title", axis=1).drop("label", axis=1)
dfRealText["text_length"] = dfRealText["text"].apply(len)
# Remove Rubbish: keep only articles longer than 250 characters
dfRealText = dfRealText[dfRealText["text_length"] > 250]
dfRealText = dfRealText.sort_values(by="text_length", ascending=True)
# Fake article bodies with their character lengths
dfFakeText = news_dataset[news_dataset["label"] == 0].drop("title", axis=1).drop("label", axis=1)
dfFakeText["text_length"] = dfFakeText["text"].apply(len)
# Remove Rubbish: keep only articles longer than 250 characters
dfFakeText = dfFakeText[dfFakeText["text_length"] > 250]
dfFakeText = dfFakeText.sort_values(by="text_length", ascending=True)
# Get Real Text Length Average / Size
dfRealTextLengths = dfRealText.groupby("text_length").size().reset_index(name="count")
# Get Fake Text Length Average / Size
dfFakeTextLengths = dfFakeText.groupby("text_length").size().reset_index(name="count")
# Set Scatter
plt.scatter(dfRealTextLengths["text_length"], dfRealTextLengths["count"], label="Real", c=charcoalColour)
plt.scatter(dfFakeTextLengths["text_length"], dfFakeTextLengths["count"], label="Fake", c=tealColour)
# Set Title
plt.title("Article Text Lengths - Real vs Fake")
plt.xlabel("Text Lengths")
plt.ylabel("Count")
# Set Legend
plt.legend()
# Show
plt.show()
# Get Real Titles
dfRealTitles = dfTitleRealWords
# Drop Columns
dfRealTitles = dfRealTitles.drop("count", axis=1)
dfRealTitles = dfRealTitles.drop("type", axis=1)
dfRealTitles = dfRealTitles.drop("label", axis=1)
# Add Column
dfRealTitles["word_length"] = dfRealTitles["word"].apply(lambda x: len(x))
# Sort
dfRealTitles = dfRealTitles.sort_values(by="word_length", ascending=True)
# Get Fake Titles
dfFakeTitles = dfTitleFakeWords
# Drop Columns
dfFakeTitles = dfFakeTitles.drop("count", axis=1)
dfFakeTitles = dfFakeTitles.drop("type", axis=1)
dfFakeTitles = dfFakeTitles.drop("label", axis=1)
# Add Column
dfFakeTitles["word_length"] = dfFakeTitles["word"].apply(lambda x: len(x))
# Sort
dfFakeTitles = dfFakeTitles.sort_values(by="word_length", ascending=True)
# Get Real Word Length Average / Size
dfRealWordLengths = dfRealTitles.groupby("word_length").size().reset_index(name="count")
# Get Fake Word Length Average / Size
dfFakeWordLengths = dfFakeTitles.groupby("word_length").size().reset_index(name="count")
# Set Lines
plt.plot(dfRealWordLengths["word_length"], dfRealWordLengths["count"], label="Real", c=charcoalColour)
plt.plot(dfFakeWordLengths["word_length"], dfFakeWordLengths["count"], label="Fake", c=tealColour)
# Set Title
plt.title("Word Lengths - Real vs Fake")
plt.xlabel("Word Lengths")
plt.ylabel("Count")
# Set Legend
plt.legend()
# Show
plt.show()
# Assign Values
words = dfTitleWords["word"][:25]
real_counts = dfTitleWords["real_count"][:25]
fake_counts = dfTitleWords["fake_count"][:25]
# Set up horizontal bars
plt.barh(words, real_counts, label="Real Words", color=charcoalColour)
plt.barh(words, fake_counts, left=real_counts, label="Fake Words", color=tealColour)
# Set Titles
plt.xlabel("Counts")
plt.ylabel("Words")
plt.title("Top 25 Real vs Fake Words in Article Title")
# Set Legend
plt.legend()
# Show
plt.show()
# Assign Values
words = dfTextWords["word"][:25]
real_counts = dfTextWords["real_count"][:25]
fake_counts = dfTextWords["fake_count"][:25]
# Set up horizontal bars
plt.barh(words, real_counts, label="Real Words", color=charcoalColour)
plt.barh(words, fake_counts, left=real_counts, label="Fake Words", color=tealColour)
# Set Titles
plt.xlabel("Counts")
plt.ylabel("Words")
plt.title("Top 25 Real vs Fake Words in Article Text")
# Set Legend
plt.legend()
# Show
plt.show()
# Setup Label Encoder
labelEncoder = LabelEncoder()
# Features: concatenated title + body text; target: real/fake label
X = news_dataset["title"] + " " + news_dataset["text"]
y = news_dataset["label"]
# Encode labels (already 0/1 ints here, but this keeps the pipeline robust
# should string labels ever be used)
y = labelEncoder.fit_transform(y)
# 80/20 train/test split. random_state makes every run reproducible and
# stratify keeps the real/fake class balance identical in both splits —
# the original call had neither, so each run produced a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Human-readable class names, index-aligned with the encoded labels (0=Fake, 1=Real)
labels = ["Fake", "Real"]
def buildPipeline(classifier, parameters_grid):
    """Fit a TF-IDF + classifier pipeline and predict the held-out test split.

    Parameters
    ----------
    classifier : estimator
        Any sklearn-compatible classifier instance.
    parameters_grid : dict or None
        Parameter grid for GridSearchCV; when None the pipeline is fitted as-is.

    Returns
    -------
    tuple
        (tfidfVectorizer, classifier, model, y_pred) where model is the fitted
        pipeline (the grid search's best estimator when a grid was supplied)
        and y_pred are its predictions on the global X_test.

    NOTE(review): relies on module-level X_train / y_train / X_test.
    """
    # Word + bigram TF-IDF features capped at 15k terms; drop very rare
    # (min_df=2) and very common (max_df=0.6) terms.
    tfidfVectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), min_df=2, max_df=0.6, sublinear_tf=True)
    # Vectorizer and classifier chained so CV folds are vectorized correctly.
    model = Pipeline(steps=[
        ("tfidfvectorizer", tfidfVectorizer),
        ("model", classifier)
    ])
    # Initial fit also fits the returned tfidfVectorizer/classifier in place,
    # which callers (e.g. analyseNewsArticle) rely on.
    model.fit(X_train, y_train)
    if parameters_grid is not None:
        # Run the fold fits across processes.
        # NOTE(review): best_estimator_ is refit on the whole training set,
        # making the initial fit above redundant work in the grid-search path;
        # kept so the returned vectorizer/classifier stay fitted either way.
        with parallel_backend("multiprocessing"):
            grid_search = GridSearchCV(model, parameters_grid, n_jobs=-1)
            grid_search.fit(X_train, y_train)
            # Replace the plain pipeline with the tuned one.
            model = grid_search.best_estimator_
    # Predict the held-out split for the confusion matrix / report.
    y_pred = model.predict(X_test)
    return tfidfVectorizer, classifier, model, y_pred
def buildConfusionMatrix(y_test, y_pred):
    """Build a plotly heatmap of the confusion matrix and print the text report.

    Parameters
    ----------
    y_test, y_pred : array-like
        Encoded true and predicted labels (0=Fake, 1=Real per the label encoding).

    Returns
    -------
    tuple
        (confusionMatrix, figure_confusionMatrix, classificationReport, accuracy):
        the row-swapped 2x2 matrix (list of two rows), the plotly Figure,
        the dict form of sklearn's classification report, and the accuracy score.

    NOTE(review): relies on module-level `labels`, `charcoalColour`, `tealColour`.
    """
    # sklearn orders rows by encoded label: row 0 = Fake, row 1 = Real
    confusionMatrix = confusion_matrix(y_test, y_pred)
    # Swap the rows so that, with plotly's bottom-up y axis and y=labels[::-1]
    # below, the Fake row renders at the top of the heatmap
    confusionMatrix = [confusionMatrix[1], confusionMatrix[0]]
    # Two-colour scale matching the notebook palette
    colours = [charcoalColour, tealColour]
    # Heatmap of the counts; the counts are also stored in `text` so the
    # subplot helper can annotate each cell later
    figure_confusionMatrix = go.Figure(data=go.Heatmap(z=confusionMatrix,
                                                       x=labels,
                                                       y=labels[::-1],
                                                       colorscale=colours,
                                                       hoverinfo="z",
                                                       showscale=False,
                                                       text=confusionMatrix))
    # Dict form of the report, consumed later by appendMetrics
    classificationReport = classification_report(y_test, y_pred, output_dict=True)
    # Plain accuracy on the test split
    accuracy = accuracy_score(y_test, y_pred)
    # Human-readable report printed to the cell output
    print(classification_report(y_test, y_pred, target_names=labels))
    # Print Accuracy
    print("Accuracy:", accuracy)
    # Return
    return confusionMatrix, figure_confusionMatrix, classificationReport, accuracy
def appendMetrics(name, classificationReport, metrics):
    """Append one model's headline metrics to the running results DataFrame.

    Parameters
    ----------
    name : str
        Display name of the classification model.
    classificationReport : dict
        Output of sklearn's classification_report(..., output_dict=True).
    metrics : pandas.DataFrame or None
        Results accumulated so far; pass None on the first call to start a
        new DataFrame.

    Returns
    -------
    pandas.DataFrame
        The accumulated results including the new row. Scores are stored as
        strings: percentages rounded to 4 decimal places.
    """
    macro = classificationReport["macro avg"]
    # One row of macro-averaged percentage scores for this model.
    newRow = pd.DataFrame({
        "Name": name,
        "Accuracy": str(round(classificationReport["accuracy"] * 100, 4)),
        "Precision": str(round(macro["precision"] * 100, 4)),
        "Recall": str(round(macro["recall"] * 100, 4)),
        "F1-Score": str(round(macro["f1-score"] * 100, 4))
    }, index=[0])
    # Key on the accumulator rather than a hard-coded model name: the original
    # started a fresh frame whenever name == "Naive Bayes", which would
    # silently discard `metrics` if models were ever renamed or reordered.
    if metrics is None:
        return newRow
    # Append Results
    return pd.concat([metrics, newRow], ignore_index=True)
# Build Naive Bayes Pipeline (alpha=0.1 smoothing, uniform class priors)
tfidfVectorizer_nb, classifier_nb, model_nb, y_pred_nb = buildPipeline(MultinomialNB(alpha=0.1, fit_prior=False), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_nb, fig_cm_nb, classification_nb, accuracy_nb = buildConfusionMatrix(y_test, y_pred_nb)
# Start the metrics table (None => first row); classificationName_nb is defined earlier in the notebook
metrics = appendMetrics(classificationName_nb, classification_nb, None)
precision recall f1-score support
Fake 0.96 0.97 0.96 4686
Real 0.97 0.95 0.96 4294
accuracy 0.96 8980
macro avg 0.96 0.96 0.96 8980
weighted avg 0.96 0.96 0.96 8980
Accuracy: 0.9628062360801781
# Build Logistic Regression Pipeline.
# n_jobs removed: the run emitted a UserWarning that 'n_jobs' has no effect
# when solver="liblinear", so dropping it changes nothing but silences the warning.
tfidfVectorizer_lr, classifier_lr, model_lr, y_pred_lr = buildPipeline(LogisticRegression(C=50, solver="liblinear"), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_lr, fig_cm_lr, classification_lr, accuracy_lr = buildConfusionMatrix(y_test, y_pred_lr)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_lr, classification_lr, metrics)
C:\Users\SimonMurrell\anaconda3\envs\nlp\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 20.
precision recall f1-score support
Fake 0.99 1.00 1.00 4686
Real 1.00 0.99 0.99 4294
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
Accuracy: 0.9948775055679288
# Build Linear SVM Pipeline (linear kernel, C=2 regularization)
tfidfVectorizer_svm, classifier_svm, model_svm, y_pred_svm = buildPipeline(SVC(C=2, kernel="linear"), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_svm, fig_cm_svm, classification_svm, accuracy_svm = buildConfusionMatrix(y_test, y_pred_svm)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_svm, classification_svm, metrics)
precision recall f1-score support
Fake 1.00 0.99 0.99 4686
Real 0.99 1.00 0.99 4294
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
Accuracy: 0.9944320712694877
# Build Random Forest Pipeline (150 trees, depth cap 500, all cores)
tfidfVectorizer_rf, classifier_rf, model_rf, y_pred_rf = buildPipeline(RandomForestClassifier(n_estimators=150, max_depth=500, n_jobs=-1), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_rf, fig_cm_rf, classification_rf, accuracy_rf = buildConfusionMatrix(y_test, y_pred_rf)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_rf, classification_rf, metrics)
precision recall f1-score support
Fake 0.97 0.99 0.98 4686
Real 0.99 0.97 0.98 4294
accuracy 0.98 8980
macro avg 0.98 0.98 0.98 8980
weighted avg 0.98 0.98 0.98 8980
Accuracy: 0.9791759465478842
# Build XG Boost Pipeline (250 boosting rounds, tree depth 7)
tfidfVectorizer_xg, classifier_xg, model_xg, y_pred_xg = buildPipeline(XGBClassifier(max_depth=7, n_estimators=250), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_xg, fig_cm_xg, classification_xg, accuracy_xg = buildConfusionMatrix(y_test, y_pred_xg)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_xg, classification_xg, metrics)
precision recall f1-score support
Fake 0.99 0.99 0.99 4686
Real 0.99 0.99 0.99 4294
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
Accuracy: 0.9909799554565701
# Build LightGBM Pipeline (gradient-boosted trees, 200 rounds, 20 leaves)
tfidfVectorizer_lg, classifier_lg, model_lg, y_pred_lg = buildPipeline(LGBMClassifier(boosting_type="gbdt", learning_rate=0.2, n_estimators=200, num_leaves=20), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_lg, fig_cm_lg, classification_lg, accuracy_lg = buildConfusionMatrix(y_test, y_pred_lg)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_lg, classification_lg, metrics)
[LightGBM] [Info] Number of positive: 17123, number of negative: 18795
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 2.682172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1421730
[LightGBM] [Info] Number of data points in the train set: 35918, number of used features: 14996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476725 -> initscore=-0.093168
[LightGBM] [Info] Start training from score -0.093168
precision recall f1-score support
Fake 0.99 0.99 0.99 4686
Real 0.99 0.99 0.99 4294
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
Accuracy: 0.9926503340757238
# Build Adaptive Boosting Pipeline (500 weak learners, learning rate 0.5)
tfidfVectorizer_ab, classifier_ab, model_ab, y_pred_ab = buildPipeline(AdaBoostClassifier(learning_rate=0.5, n_estimators=500), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_ab, fig_cm_ab, classification_ab, accuracy_ab = buildConfusionMatrix(y_test, y_pred_ab)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_ab, classification_ab, metrics)
precision recall f1-score support
Fake 0.99 0.99 0.99 4686
Real 0.99 0.99 0.99 4294
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
Accuracy: 0.9913140311804008
# Build Cat Boosting Pipeline (1000 iterations, depth 6; verbose=False silences per-iteration logs)
tfidfVectorizer_cb, classifier_cb, model_cb, y_pred_cb = buildPipeline(CatBoostClassifier(learning_rate=0.1, iterations=1000, depth=6, verbose=False), None)
# Build Confusion Matrix Heat Map Figure and print the classification report
confusionMatrix_cb, fig_cm_cb, classification_cb, accuracy_cb = buildConfusionMatrix(y_test, y_pred_cb)
# Append this model's metrics to the running results table
metrics = appendMetrics(classificationName_cb, classification_cb, metrics)
precision recall f1-score support
Fake 0.99 0.99 0.99 4686
Real 0.99 0.99 0.99 4294
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
Accuracy: 0.9902004454342984
def addConfusionMatrixFigure(fig, figure, row, col):
    """Attach a confusion-matrix heatmap trace to a subplot cell and write
    each cell's count on top of it as a white annotation.

    `figure` is the Heatmap trace; its `text` attribute carries the raw
    counts set by buildConfusionMatrix.
    """
    fig.add_trace(figure, row, col)
    # Annotate every cell with its count.
    for y_idx, matrix_row in enumerate(figure["text"]):
        for x_idx, cell_value in enumerate(matrix_row):
            fig.add_annotation(
                x=x_idx,
                y=y_idx,
                text=str(cell_value),
                showarrow=False,
                font=dict(color="white", size=12),
                xref="x",
                yref="y",
                row=row,
                col=col
            )
# Lay out a 4x2 grid, one confusion matrix per fitted model.
fig = make_subplots(rows=4,
                    cols=2,
                    subplot_titles=(classificationName_nb,
                                    classificationName_lr,
                                    classificationName_svm,
                                    classificationName_rf,
                                    classificationName_xg,
                                    classificationName_lg,
                                    classificationName_ab,
                                    classificationName_cb))
# Place each model's heatmap trace into its grid cell.
placements = [
    (fig_cm_nb, 1, 1), (fig_cm_lr, 1, 2),
    (fig_cm_svm, 2, 1), (fig_cm_rf, 2, 2),
    (fig_cm_xg, 3, 1), (fig_cm_lg, 3, 2),
    (fig_cm_ab, 4, 1), (fig_cm_cb, 4, 2),
]
for cm_figure, grid_row, grid_col in placements:
    addConfusionMatrixFigure(fig, cm_figure.data[0], row=grid_row, col=grid_col)
# Title and size the combined figure.
fig.update_layout(title="Classification Models Confusion Matrices",
                  title_x=0.5,
                  height=1000)
# Show Chart
fig.show()
# Convert the stored string scores back to floats so they sort and plot numerically.
for column in ("Accuracy", "Precision", "Recall", "F1-Score"):
    metrics[column] = metrics[column].astype(float)
# Rank models best-first by accuracy. sort_values already returns a new frame,
# so the original's extra .copy() was redundant, and its self-assignment of
# the Accuracy column was a no-op — both dropped.
metrics = metrics.sort_values(by=["Accuracy"], ascending=False)
# Keep a copy for the cross-validation comparison later.
metrics_cvs = metrics.copy()
# Output Results (notebook cell display)
metrics
| Name | Accuracy | Precision | Recall | F1-Score | |
|---|---|---|---|---|---|
| 1 | Logistic Regression | 99.4878 | 99.4887 | 99.4848 | 99.4868 |
| 2 | Linear SVC | 99.4432 | 99.4376 | 99.4470 | 99.4422 |
| 5 | LightGBM | 99.2650 | 99.2618 | 99.2656 | 99.2637 |
| 6 | Adaptive Boosting | 99.1314 | 99.1199 | 99.1414 | 99.1299 |
| 4 | XG Boost | 99.0980 | 99.0958 | 99.0968 | 99.0963 |
| 7 | CatBoost | 99.0200 | 99.0155 | 99.0211 | 99.0182 |
| 3 | Random Forest | 97.9176 | 97.9850 | 97.8635 | 97.9112 |
| 0 | Naive Bayes | 96.2806 | 96.3296 | 96.2297 | 96.2699 |
# Chart colour palette (colour variables are defined earlier in the notebook)
chart_colours = [charcoalColour, tealColour, lightGreenColour, purpleColour]
# Map each metric name to its colour for the per-metric traces below
chart_colours_dict = { "Accuracy": charcoalColour, "Precision": tealColour, "Recall": lightGreenColour, "F1-Score": purpleColour}
def addMetricFigure(fig, metric, row, col):
    """Add a line+marker trace of one metric (per model) to a subplot cell.

    NOTE(review): the original also built a second "text"-mode trace with the
    rounded values, but only data[0] was ever copied into the subplot figure,
    so that trace was dead code and has been removed — rendered output is
    unchanged.

    Relies on the module-level `metrics` DataFrame and `chart_colours_dict`.
    """
    # One point per model, coloured per the shared palette.
    trace = go.Scatter(x=metrics["Name"],
                       y=metrics[metric],
                       mode="lines+markers",
                       name=metric,
                       marker=dict(color=chart_colours_dict[metric], size=8))
    # Place the trace into the requested grid cell.
    fig.add_trace(trace, row=row, col=col)
# 2x2 grid, one panel per performance metric.
fig = make_subplots(rows=2,
                    cols=2,
                    subplot_titles=("Accuracy", "Precision", "Recall", "F1-Score"))
# Fill the grid left-to-right, top-to-bottom.
panel_metrics = ["Accuracy", "Precision", "Recall", "F1-Score"]
for panel_index, panel_metric in enumerate(panel_metrics):
    addMetricFigure(fig, panel_metric, panel_index // 2 + 1, panel_index % 2 + 1)
# Title and size the combined figure.
fig.update_layout(title="Classification Models Subplots",
                  title_x=0.5,
                  height=800)
# Show Chart
fig.show()
# Grouped interactive bar chart comparing all four metrics per model.
metric_columns = ["Accuracy", "Precision", "Recall", "F1-Score"]
fig = px.bar(metrics,
             x="Name",
             y=metric_columns,
             barmode="group",
             color_discrete_map=chart_colours_dict)
# Apply titles and axis labels.
fig.update_layout(title="Interactive Performance Metrics",
                  title_x=0.5,
                  height=700,
                  xaxis_title="Classification Model",
                  yaxis_title="Percentage")
# Zoom the y axis into the 95-100% band where all the models sit.
fig.update_yaxes(range=[95, 100])
# Show Chart
fig.show()
# Map display names to sklearn scoring strings for cross_val_score.
# NOTE(review): "precision"/"recall"/"f1" are the binary scorers (pos_label=1,
# i.e. "Real" after label encoding), unlike the macro averages reported above
# — confirm this is the intended comparison.
scores = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "F1-Score": "f1"
}
def crossValidateMetric(model, name, X, y, cv=5):
    """Cross-validate `model` on (X, y) for every scoring in `scores` and
    write each mean score (as a percentage) into the matching row of the
    module-level `metrics_cvs` DataFrame, in a new "<Metric> CVS" column.

    Parameters
    ----------
    model : estimator
        Pipeline to evaluate (cross_val_score clones and refits it per fold).
    name : str
        Value of the "Name" column identifying the target row in metrics_cvs.
    X, y : array-like
        Full feature/label data to cross-validate on.
    cv : int
        Number of folds (default 5).
    """
    # The target row is the same for every metric, so look it up once
    # instead of once per scoring as the original did.
    row_index = metrics_cvs[metrics_cvs["Name"] == name].index
    for metric, scoring in scores.items():
        # Run up to 3 folds in parallel.
        results = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=3)
        # Store the fold-mean as a percentage next to the hold-out scores.
        metrics_cvs.loc[row_index, metric + " CVS"] = np.mean(results) * 100
# Use 3 folds to keep the run time of eight models manageable
cv = 3
# Cross-validate every fitted model; each call fills that model's
# "<Metric> CVS" columns in metrics_cvs in place
crossValidateMetric(model_nb, classificationName_nb, X, y, cv)
crossValidateMetric(model_lr, classificationName_lr, X, y, cv)
crossValidateMetric(model_svm, classificationName_svm, X, y, cv)
crossValidateMetric(model_rf, classificationName_rf, X, y, cv)
crossValidateMetric(model_xg, classificationName_xg, X, y, cv)
crossValidateMetric(model_lg, classificationName_lg, X, y, cv)
crossValidateMetric(model_ab, classificationName_ab, X, y, cv)
crossValidateMetric(model_cb, classificationName_cb, X, y, cv)
# Display the combined hold-out vs cross-validated metrics (notebook cell output)
metrics_cvs
| Name | Accuracy | Precision | Recall | F1-Score | Accuracy CVS | Precision CVS | Recall CVS | F1-Score CVS | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | Logistic Regression | 99.4878 | 99.4887 | 99.4848 | 99.4868 | 98.521092 | 98.946011 | 97.954896 | 98.440256 |
| 2 | Linear SVC | 99.4432 | 99.4376 | 99.4470 | 99.4422 | 98.581229 | 98.848117 | 98.183686 | 98.507747 |
| 5 | LightGBM | 99.2650 | 99.2618 | 99.2656 | 99.2637 | 98.690365 | 98.959680 | 98.295746 | 98.622655 |
| 6 | Adaptive Boosting | 99.1314 | 99.1199 | 99.1414 | 99.1299 | 98.532229 | 98.621454 | 98.309754 | 98.459829 |
| 4 | XG Boost | 99.0980 | 99.0958 | 99.0968 | 99.0963 | 98.329547 | 98.733805 | 97.758790 | 98.239539 |
| 7 | CatBoost | 99.0200 | 99.0155 | 99.0211 | 99.0182 | 98.187002 | 97.919425 | 98.319092 | 98.108835 |
| 3 | Random Forest | 97.9176 | 97.9850 | 97.8635 | 97.9112 | 96.594503 | 98.534256 | 94.420320 | 96.475840 |
| 0 | Naive Bayes | 96.2806 | 96.3296 | 96.2297 | 96.2699 | 91.850416 | 95.863269 | 86.935612 | 90.934652 |
def analyseNewsArticle(title, text):
    """Classify an article as "Real" or "Fake" by majority vote of the eight
    fitted classifiers; anything short of 5 "Real" votes counts as "Fake"."""
    # Combine the headline and body into one document.
    sentence = title + " - " + text
    # Apply the same preprocessing chain used on the training corpus
    # (removeUrls / removepunctuation / removeUnicode / lemmatizeText are
    # defined earlier in the notebook).
    sentence = sentence.lower()
    sentence = removeUrls(sentence)
    sentence = sentence.translate(removepunctuation)
    sentence = re.sub(r"\d+", "", sentence)
    sentence = removeUnicode(sentence)
    sentence = lemmatizeText(sentence)
    # Vectorize once with the NB pipeline's vectorizer.
    # NOTE(review): every pipeline fits an identically-configured vectorizer
    # on the same split, so the vocabularies should match — confirm if the
    # pipeline settings ever diverge.
    sentence_vectorized = tfidfVectorizer_nb.transform([sentence])
    # Labels are encoded 0=Fake, 1=Real, so the sum of the eight predictions
    # is the number of "Real" votes.
    voters = (classifier_nb, classifier_lr, classifier_svm, classifier_rf,
              classifier_xg, classifier_lg, classifier_ab, classifier_cb)
    prediction = sum(voter.predict(sentence_vectorized)[0] for voter in voters)
    # Require a clear majority (at least 5 of 8) for "Real".
    if prediction >= 5:
        return "Real"
    else:
        return "Fake"
# Initialize NewsAPI client (newsAPIKey is defined earlier in the notebook)
newsapi = NewsApiClient(api_key=newsAPIKey)
# Get Top Headlines - BBC News / CNN / NBC News / Russian TV
top_headlines = newsapi.get_top_headlines(q="",
                                          sources="bbc-news,cnn,nbc-news,rt",
                                          language="en",
                                          page_size=20)
# Only proceed when the API call succeeded
if top_headlines["status"] == "ok":
    # Section header (typo "Analaysis" fixed)
    display(Markdown("#\n# Fake / Real News Analysis:\n#"))
    # Classify each returned headline
    for article in top_headlines["articles"]:
        # Assign Values
        title = article["title"]
        text = article["content"]
        url = article["url"]
        # publishedAt is ISO-8601 ("YYYY-MM-DDTHH:MM:SSZ"); keep just the
        # date part (the original's replace("T", " ") before truncating to
        # 10 characters had no effect on the result and was dropped)
        publishedDate = article["publishedAt"][:10]
        titleLink = f'<a href="{url}" target="_blank">{title}</a>'
        # Articles can arrive without content; skip classification for those
        if text is not None:
            # Run the majority-vote classifier
            status = analyseNewsArticle(title, text)
            # Render the verdict as a markdown row
            display(Markdown(f"**Title:** {titleLink}\n**Published Date:** {publishedDate}\n**Status:** {status}"))
            # Print Spacer
            display(Markdown("#"))
Title: Italian man crushed to death under falling cheese wheels Published Date: 2023-08-07 Status: Real
Title: Tou Thao: Ex-officer in George Floyd case gets 57 months for role in killing Published Date: 2023-08-07 Status: Real
Title: Brick Lane: Chinese political slogans appear on London street art wall Published Date: 2023-08-07 Status: Real
Title: Hank the Tank: Fugitive burglar bear captured in California Published Date: 2023-08-07 Status: Real
Title: A Moscow summer with war on people's minds Published Date: 2023-08-07 Status: Real
Title: Ex-FBI counterintelligence chief in talks to plead guilty over work he did for Russian oligarch Published Date: 2023-08-07 Status: Real
Title: Matty Healy: The 1975 threatened with legal action after Malaysia festival cancellation Published Date: 2023-08-07 Status: Real
Title: Ahead of Ohio abortion vote, Republicans try to change the rules Published Date: 2023-08-07 Status: Real
Title: Only 1 in 5 adults with an opioid use disorder received medication to treat it in 2021 Published Date: 2023-08-07 Status: Real
Title: 162 infant deaths have been associated with nursing pillows since 2007, investigation finds Published Date: 2023-08-07 Status: Real
Title: 1 hurt in a possible explosion at a Sherwin-Williams paint factory plant in Texas Published Date: 2023-08-07 Status: Real
Title: Zelenskyy assassination plot: Ukraine detains Russian informant suspect Published Date: 2023-08-07 Status: Real
Title: The new liberal majority on the Wisconsin Supreme Court is off to a tense start Published Date: 2023-08-07 Status: Real
Title: Angus Cloud: Euphoria star's mother says his death was 'not intentional' Published Date: 2023-08-07 Status: Real
Title: More gay men can give blood as ‘one of the most significant changes in blood banking history’ gets underway Published Date: 2023-08-07 Status: Real
Title: Top librarian calls 'Marxist lesbian' tweet backlash 'regrettable' Published Date: 2023-08-07 Status: Real
Title: 'Of course he lost': Ron DeSantis rejects Trump's 2020 election claims Published Date: 2023-08-07 Status: Real
Title: Daimler Truck finance chief Jochen Goetz dies in ‘tragic incident’ Published Date: 2023-08-07 Status: Real
Title: WATCH: Moment Alaska house collapses into river Published Date: 2023-08-07 Status: Real